#!/usr/bin/env python3
"""
GraphGAP external hard-corpus validation (TF-IDF embedding, calibrated tau) - reproducible script.

This script implements:
1) TF-IDF vectorization over document chunks (windowed text),
2) Requirement query vectors from keyword clusters,
3) cosine max-pooling similarity sim(doc, Rk),
4) unsupervised tau calibration via random-null queries,
5) HardSignalRate per requirement,
6) Ablation grid: pages (50/100), tokenizer (word/char), ngram_range, tau offsets.

Inputs
- corpus_register.csv: at minimum must include columns:
  DocID, Title, Source URL (official) + retrieval keywords
- requirement_queries.csv: columns:
  requirement_id, query_text
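- local_docs/ (--local_docs, default "local_docs"): folder of plain-text files named DocID.txt, one per register row. Only .txt files are read, so convert PDFs to text first. DocIDs without a matching file are skipped and listed in outputs/missing_local_docs.csv; the script exits if no matching file is found at all.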

Outputs
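- outputs/hardsignal_rates.csv (NOTE: not written by the code in this script; per-requirement rates are in ablation_summary.csv)
- outputs/ablation_summary.csv: HardSignalRate per requirement for every ablation config
- outputs/missing_local_docs.csv: DocIDs from the register that have no local text file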
"""
from __future__ import annotations
import argparse, os, re, random, hashlib
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def normalize_ws(text: str) -> str:
    """Collapse each run of whitespace in *text* to one space and trim both ends."""
    collapsed = re.compile(r"\s+").sub(" ", text)
    return collapsed.strip()

def chunk_tokens(text: str, window: int = 220, step: int = 140) -> list[str]:
    """Split *text* into windows of whitespace-delimited tokens.

    Windows start every ``step`` tokens and hold up to ``window`` tokens, so
    consecutive windows overlap whenever ``window > step``. Chunking stops at
    the first window that reaches the end of the token list.

    Args:
        text: Text to split; tokens are produced by ``str.split()``.
        window: Maximum number of tokens per chunk (must be >= 1).
        step: Distance in tokens between chunk starts (must be >= 1).

    Returns:
        The chunks as space-joined strings. If *text* contains no tokens, a
        single-element list holding the first 2000 characters of *text*.

    Raises:
        ValueError: If ``window`` or ``step`` is smaller than 1.
    """
    if window < 1 or step < 1:
        raise ValueError("window and step must be positive integers")
    toks = text.split()
    chunks = []
    # Iterate start offsets up to len(toks): stopping one short silently
    # dropped the trailing tokens whenever window <= step.
    for start in range(0, len(toks), step):
        chunks.append(" ".join(toks[start:start + window]))
        if start + window >= len(toks):
            # This window already covers the tail; a further one would only
            # repeat tokens that are already included.
            break
    return chunks or [text[:2000]]

def read_text_file(path: Path) -> str:
    """Return the contents of *path* decoded as UTF-8, silently dropping undecodable bytes."""
    with path.open("r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()

def sha256_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in 1 MiB blocks so large files are never held in memory
    in full.
    """
    block_size = 1024 * 1024
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(block_size)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()

def build_vectorizer(tokenizer: str, ngram_range: tuple[int,int]):
    """Create an unfitted TfidfVectorizer for the requested tokenizer.

    ``tokenizer == "char"`` selects character n-grams; any other value selects
    word n-grams. Both variants use the given ``ngram_range`` together with
    ``min_df=1`` and ``max_df=0.95`` (scikit-learn's document-frequency
    cut-offs).
    """
    analyzer = "char" if tokenizer == "char" else "word"
    return TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, min_df=1, max_df=0.95)

def calibrate_tau(vectorizer: TfidfVectorizer, chunk_matrix, vocab: list[str],
                  n_null: int = 200, q_len: int = 8, quantile: float = 0.99, seed: int = 7) -> float:
    """Estimate the similarity threshold tau from random "null" queries.

    Draws ``n_null`` queries, each made of up to ``q_len`` distinct terms
    sampled from ``vocab`` and joined with spaces, scores every query against
    all rows of ``chunk_matrix`` and keeps the highest cosine similarity per
    query. Tau is the requested ``quantile`` of those per-query maxima.

    Args:
        vectorizer: Fitted vectorizer used to embed the null queries.
        chunk_matrix: Matrix the queries are compared against via cosine similarity.
        vocab: Terms to sample from.
        n_null: Number of null queries to draw.
        q_len: Terms per query (capped at ``len(vocab)``).
        quantile: Quantile of the null maxima returned as tau.
        seed: Seed for the private random generator, for reproducibility.

    Returns:
        The calibrated threshold as a Python float.
    """
    rng = random.Random(seed)
    terms_per_query = min(q_len, len(vocab))
    # Draw all queries first; the sampling order matches a one-by-one loop, so
    # the same seed always yields the same queries.
    null_queries = [" ".join(rng.sample(vocab, k=terms_per_query)) for _ in range(n_null)]
    peak_sims = [
        float(cosine_similarity(vectorizer.transform([query]), chunk_matrix).max())
        for query in null_queries
    ]
    return float(np.quantile(peak_sims, quantile))

def compute_hardsignal(corpus_texts: dict[str,str], req_queries: pd.DataFrame,
                       tokenizer: str="word", ngram_range=(1,2), tau=None, tau_offset=0.0,
                       window=220, step=140, seed=7):
    """Compute the HardSignalRate of every requirement query over a corpus.

    Each document is whitespace-normalised and cut into token windows, all
    windows are embedded with TF-IDF, and every requirement query is scored
    against every window by cosine similarity. A document's score for a
    requirement is the maximum over its windows; the document counts as a hard
    signal when that score is >= tau. HardSignalRate is the fraction of
    documents that count.

    Args:
        corpus_texts: Mapping of DocID to raw document text.
        req_queries: Frame with ``requirement_id`` and ``query_text`` columns.
        tokenizer: ``"char"`` for character n-grams, anything else for words.
        ngram_range: N-gram range handed to the vectorizer.
        tau: Base threshold; when ``None`` it is calibrated from null queries.
        tau_offset: Value added to the base threshold before it is applied.
        window: Tokens per chunk.
        step: Token stride between chunk starts.
        seed: Seed forwarded to the tau calibration.

    Returns:
        ``(frame, tau)`` where *frame* has one row per requirement (columns
        ``requirement_id``, ``HardSignalRate``, ``tau_used``, ``tokenizer``,
        ``ngram_range``) and *tau* is the threshold actually applied, i.e.
        including ``tau_offset``.
    """
    # Flatten the corpus into two parallel lists: chunk text and owning DocID.
    doc_chunks = []
    doc_chunk_owner = []
    for docid, raw in corpus_texts.items():
        pieces = chunk_tokens(normalize_ws(raw), window=window, step=step)
        doc_chunks.extend(pieces)
        doc_chunk_owner.extend([docid] * len(pieces))

    # Fit TF-IDF on the chunks themselves.
    vec = build_vectorizer(tokenizer, ngram_range)
    X = vec.fit_transform(doc_chunks)

    # Calibrate the base threshold only when the caller did not supply one.
    if tau is None:
        tau = calibrate_tau(vec, X, vocab=list(vec.vocabulary_.keys()), seed=seed)
    tau = tau + tau_offset

    records = []
    for _, row in req_queries.iterrows():
        rid = str(row["requirement_id"])
        query = str(row["query_text"])
        sims = cosine_similarity(vec.transform([query]), X).ravel()

        # Max-pool chunk similarities up to document level.
        best = {}
        for score, owner in zip(sims, doc_chunk_owner):
            prev = best.get(owner)
            if prev is None or score > prev:
                best[owner] = float(score)

        flags = [int(peak >= tau) for peak in best.values()]
        rate = float(np.mean(flags)) if flags else float("nan")
        records.append({"requirement_id": rid, "HardSignalRate": rate, "tau_used": tau, "tokenizer": tokenizer, "ngram_range": str(ngram_range)})
    return pd.DataFrame(records), tau

def _first_pages(text: str, pages) -> str:
    """Return *text* cut down to its first *pages* form-feed-delimited pages.

    NOTE(review): assumes page breaks are form feeds ("\\f"), the separator
    pdftotext emits between pages. Confirm against how the TXT files were
    produced. Text without form feeds, or a falsy *pages*, is returned
    unchanged.
    """
    if not pages or "\f" not in text:
        return text
    return "\f".join(text.split("\f")[:pages])


def main(argv: list[str] | None = None):
    """Run the ablation grid over the local corpus and write the CSV outputs.

    Reads the corpus register and requirement queries, loads ``DocID.txt`` for
    every registered document found in ``--local_docs``, evaluates each grid
    configuration with ``compute_hardsignal`` and writes
    ``ablation_summary.csv`` and ``missing_local_docs.csv`` to ``--outdir``.

    Args:
        argv: Command-line arguments; ``None`` (the default) uses ``sys.argv``.

    Raises:
        SystemExit: If a required CSV column is absent or no local document
            file is found.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--register", required=True, help="Path to corpus register CSV")
    ap.add_argument("--queries", required=True, help="Path to requirement queries CSV")
    ap.add_argument("--local_docs", default="local_docs", help="Folder with TXT files (one per DocID).")
    ap.add_argument("--outdir", default="outputs", help="Output folder.")
    args = ap.parse_args(argv)

    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    reg = pd.read_csv(args.register)
    req = pd.read_csv(args.queries)

    # Fail with a readable message instead of a bare KeyError further down.
    if "DocID" not in reg.columns:
        raise SystemExit("Register CSV must contain a 'DocID' column.")
    absent = [c for c in ("requirement_id", "query_text") if c not in req.columns]
    if absent:
        raise SystemExit(f"Queries CSV is missing required column(s): {', '.join(absent)}")

    # Expect local text files named DocID.txt in local_docs/
    local_docs = Path(args.local_docs)
    corpus_texts = {}
    missing = []
    for docid in reg["DocID"].astype(str).tolist():
        p = local_docs / f"{docid}.txt"
        if p.exists():
            corpus_texts[docid] = read_text_file(p)
        else:
            missing.append(docid)

    if not corpus_texts:
        raise SystemExit("No local docs found. Put DocID.txt files in --local_docs.")

    # Save missing list for transparency. Written before the grid runs so it
    # is available even if a later configuration fails.
    pd.DataFrame({"missing_docids": missing}).to_csv(outdir/"missing_local_docs.csv", index=False, encoding="utf-8-sig")

    # Baseline and ablation grid
    grid = [
        {"ConfigID":"A1","Pages":100,"Tokenizer":"word","ngram_range":(1,1),"tau_offset":-0.02},
        {"ConfigID":"A2","Pages":100,"Tokenizer":"word","ngram_range":(1,2),"tau_offset":0.0},
        {"ConfigID":"A3","Pages":50,"Tokenizer":"word","ngram_range":(1,2),"tau_offset":0.0},
        {"ConfigID":"A4","Pages":100,"Tokenizer":"char","ngram_range":(3,5),"tau_offset":0.0},
        {"ConfigID":"A5","Pages":50,"Tokenizer":"char","ngram_range":(3,5),"tau_offset":0.0},
    ]
    ablation_rows = []
    for g in grid:
        # Apply the page cap; previously "Pages" was never read, so configs
        # that differ only in Pages produced identical rows.
        texts = {docid: _first_pages(text, g["Pages"]) for docid, text in corpus_texts.items()}
        df, tau = compute_hardsignal(texts, req, tokenizer=g["Tokenizer"], ngram_range=g["ngram_range"], tau=None, tau_offset=g["tau_offset"])
        df.insert(0, "ConfigID", g["ConfigID"])
        # compute_hardsignal returns the offset-adjusted tau; undo the offset
        # to report the calibrated base value.
        df["tau_calibrated_base"] = tau - g["tau_offset"]
        ablation_rows.append(df)
    ablation = pd.concat(ablation_rows, ignore_index=True)
    ablation.to_csv(outdir/"ablation_summary.csv", index=False, encoding="utf-8-sig")

    print("Done. Outputs written to", outdir)

# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
